{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/kaggle/input/sf-crime/test.csv.zip\n",
      "/kaggle/input/sf-crime/train.csv.zip\n",
      "/kaggle/input/sf-crime/sampleSubmission.csv.zip\n"
     ]
    }
   ],
   "source": [
    "# This Python 3 environment comes with many helpful analytics libraries installed\n",
    "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n",
    "# For example, here's several helpful packages to load in \n",
    "\n",
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "\n",
    "# Input data files are available in the \"../input/\" directory.\n",
    "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
    "\n",
    "import os\n",
    "\n",
    "# Lazily build the full path of every file below the input directory,\n",
    "# in os.walk order, and print one path per line.\n",
    "input_paths = (os.path.join(folder, name)\n",
    "               for folder, _, names in os.walk('/kaggle/input')\n",
    "               for name in names)\n",
    "for input_path in input_paths:\n",
    "    print(input_path)\n",
    "\n",
    "# Any results you write to the current directory are saved as output."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import the libraries used in this notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#作者：1621430024\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import torch\n",
    "from torch import nn\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import PCA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Is a GPU available?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "torch.cuda.is_available()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both files are read with the same options: the 'Dates' column is parsed\n",
    "# into datetimes while loading.\n",
    "read_options = {'parse_dates': ['Dates']}\n",
    "train_data = pd.read_csv('/kaggle/input/sf-crime/train.csv.zip', **read_options)\n",
    "test_data = pd.read_csv('/kaggle/input/sf-crime/test.csv.zip', **read_options)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Data info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 878049 entries, 0 to 878048\n",
      "Data columns (total 9 columns):\n",
      "Dates         878049 non-null datetime64[ns]\n",
      "Category      878049 non-null object\n",
      "Descript      878049 non-null object\n",
      "DayOfWeek     878049 non-null object\n",
      "PdDistrict    878049 non-null object\n",
      "Resolution    878049 non-null object\n",
      "Address       878049 non-null object\n",
      "X             878049 non-null float64\n",
      "Y             878049 non-null float64\n",
      "dtypes: datetime64[ns](1), float64(2), object(6)\n",
      "memory usage: 60.3+ MB\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 884262 entries, 0 to 884261\n",
      "Data columns (total 7 columns):\n",
      "Id            884262 non-null int64\n",
      "Dates         884262 non-null datetime64[ns]\n",
      "DayOfWeek     884262 non-null object\n",
      "PdDistrict    884262 non-null object\n",
      "Address       884262 non-null object\n",
      "X             884262 non-null float64\n",
      "Y             884262 non-null float64\n",
      "dtypes: datetime64[ns](1), float64(2), int64(1), object(3)\n",
      "memory usage: 47.2+ MB\n"
     ]
    }
   ],
   "source": [
    "train_data.info()\n",
    "test_data.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Transform data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns shared by train and test. Selecting them by name instead of by\n",
    "# position keeps both frames aligned even if the CSV column order changes.\n",
    "feature_columns = ['Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y']\n",
    "# ignore_index gives every row a unique label; without it the train and\n",
    "# test row labels overlap.\n",
    "feature_frame = pd.concat((train_data[feature_columns],\n",
    "                           test_data[feature_columns]),\n",
    "                          sort=False,\n",
    "                          ignore_index=True)\n",
    "\n",
    "num_train = train_data.shape[0]\n",
    "\n",
    "# One-hot targets, one column per category. The dtype is pinned because\n",
    "# newer pandas versions return boolean dummies by default.\n",
    "train_labels = pd.get_dummies(train_data['Category'], dtype=np.uint8).values\n",
    "num_outputs = train_labels.shape[1]\n",
    "\n",
    "# Calendar features taken from the timestamp.\n",
    "feature_frame['year'] = feature_frame.Dates.dt.year\n",
    "feature_frame['month'] = feature_frame.Dates.dt.month\n",
    "feature_frame['new_year'] = feature_frame['month'].isin([1, 2]).astype(int)\n",
    "feature_frame['day'] = feature_frame.Dates.dt.day\n",
    "feature_frame['hour'] = feature_frame.Dates.dt.hour\n",
    "feature_frame['evening'] = (feature_frame['hour'] >= 18).astype(int)\n",
    "\n",
    "wkm = {\n",
    "    'Monday': 0,\n",
    "    'Tuesday': 1,\n",
    "    'Wednesday': 2,\n",
    "    'Thursday': 3,\n",
    "    'Friday': 4,\n",
    "    'Saturday': 5,\n",
    "    'Sunday': 6\n",
    "}\n",
    "feature_frame['DayOfWeek'] = feature_frame['DayOfWeek'].map(wkm)\n",
    "# Series.map yields NaN for names missing from wkm; fail loudly instead.\n",
    "assert feature_frame['DayOfWeek'].notna().all(), 'unexpected DayOfWeek value'\n",
    "# Saturday is 5 and Sunday is 6 in wkm.\n",
    "feature_frame['weekend'] = feature_frame['DayOfWeek'].isin([5, 6]).astype(int)\n",
    "\n",
    "# 1 when the address text contains 'block' (case-insensitive), else 0.\n",
    "feature_frame['block'] = feature_frame['Address'].str.lower().str.contains(\n",
    "    'block', regex=False).astype(int)\n",
    "\n",
    "district_dummies = pd.get_dummies(feature_frame['PdDistrict'], dtype=np.uint8)\n",
    "\n",
    "PCA_features = feature_frame[['X', 'Y']].values\n",
    "Standard_features = feature_frame[['DayOfWeek', 'year', 'month', 'day',\n",
    "                                   'hour']].values\n",
    "OneHot_features = pd.concat(\n",
    "    [district_dummies,\n",
    "     feature_frame[['new_year', 'evening', 'weekend', 'block']]],\n",
    "    axis=1).values\n",
    "\n",
    "scaler = StandardScaler()\n",
    "Standard_features = scaler.fit_transform(Standard_features)\n",
    "\n",
    "pca = PCA(n_components=2)\n",
    "PCA_features = pca.fit(PCA_features).transform(PCA_features)\n",
    "\n",
    "# Final design matrix: PCA coordinates, scaled calendar columns, 0/1 flags.\n",
    "all_features = np.concatenate(\n",
    "    (PCA_features, Standard_features, OneHot_features), axis=1)\n",
    "\n",
    "train_features = all_features[:num_train]\n",
    "num_inputs = train_features.shape[1]\n",
    "test_features = all_features[num_train:]\n",
    "assert test_features.shape[0] == test_data.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define the residual block"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Residual(nn.Module):\n",
    "    \"\"\"Fully connected residual block: BatchNorm1d(ReLU(Linear(X))) + shortcut(X).\n",
    "\n",
    "    Args:\n",
    "        num_inputs: width of the incoming features.\n",
    "        num_outputs: width of the features produced by the block.\n",
    "\n",
    "    The shortcut is the identity when both widths are equal; otherwise a\n",
    "    second nn.Linear projects X to num_outputs so the branches can be added.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, num_inputs, num_outputs):\n",
    "        super(Residual, self).__init__()\n",
    "        self.middle_L = nn.Linear(num_inputs, num_outputs)\n",
    "        # nn.ReLU's only argument is the boolean `inplace` flag, not a\n",
    "        # feature count, so it is constructed without arguments.\n",
    "        self.middle_R = nn.ReLU()\n",
    "        if num_inputs != num_outputs:\n",
    "            self.right = nn.Linear(num_inputs, num_outputs)\n",
    "        else:\n",
    "            self.right = None\n",
    "        self.middle_B = nn.BatchNorm1d(num_outputs)\n",
    "\n",
    "    def forward(self, X):\n",
    "        \"\"\"Apply the block to X and return main branch + shortcut.\"\"\"\n",
    "        Y = self.middle_B(self.middle_R(self.middle_L(X)))\n",
    "        # Explicit None check instead of relying on module truthiness.\n",
    "        if self.right is not None:\n",
    "            X = self.right(X)\n",
    "        return Y + X"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define the network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "class build_model(nn.Module):\n",
    "    \"\"\"Stack of nine Residual blocks followed by a linear layer and a softmax.\n",
    "\n",
    "    Args:\n",
    "        num_inputs: width of the input features.\n",
    "        num_outputs: number of values produced by the final linear layer.\n",
    "        dp: dropout probability used after the 4th and 8th residual blocks.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, num_inputs, num_outputs, dp=0.5):\n",
    "        super().__init__()\n",
    "        # Consecutive pairs of widths give the (in, out) sizes of each block.\n",
    "        widths = [num_inputs, 1024, 512, 512, 256, 256, 128, 128, 64, 64]\n",
    "        # Residual block number -> name of the dropout layer placed after it.\n",
    "        dropout_after = {4: 'Dropout1', 8: 'Dropout2'}\n",
    "        layers = nn.Sequential()\n",
    "        for number, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:]),\n",
    "                                                   start=1):\n",
    "            layers.add_module('Residual%d' % number, Residual(fan_in, fan_out))\n",
    "            if number in dropout_after:\n",
    "                layers.add_module(dropout_after[number], nn.Dropout(dp))\n",
    "        layers.add_module('Linear-out', nn.Linear(widths[-1], num_outputs))\n",
    "        layers.add_module('Softmax', nn.Softmax(dim=-1))\n",
    "        self.net = layers\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Run x through the whole stack and return the softmax output.\"\"\"\n",
    "        return self.net(x)\n",
    "\n",
    "\n",
    "net = build_model(num_inputs, num_outputs).cuda()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define the loss function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "class MultiClassLogLoss(torch.nn.Module):\n",
    "    \"\"\"Element-wise multi-class log loss, scaled by 1 / batch size.\n",
    "\n",
    "    forward() does not reduce its result: it returns one value per element\n",
    "    of y_true * log(y_pred), so callers sum it to obtain a scalar.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "\n",
    "    def forward(self, y_pred, y_true):\n",
    "        # The small constant keeps log() finite for a probability of exactly 0.\n",
    "        log_prob = torch.log(y_pred.float() + 1.00000000e-15)\n",
    "        weighted = y_true * log_prob\n",
    "        return -weighted / y_true.shape[0]\n",
    "\n",
    "\n",
    "loss = MultiClassLogLoss().cuda()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Helper functions: build a data loader, report the loss, and run one training epoch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_iter(train_features, train_labels, batch_size):\n",
    "    \"\"\"Wrap feature and label arrays in a shuffling DataLoader.\n",
    "\n",
    "    Features are converted to float32 tensors; labels keep the dtype torch\n",
    "    infers from the array. Both tensors are placed on the GPU when one is\n",
    "    available and on the CPU otherwise.\n",
    "    \"\"\"\n",
    "    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "    features = torch.tensor(train_features, dtype=torch.float).to(device)\n",
    "    labels = torch.tensor(train_labels).to(device)\n",
    "    dataset = torch.utils.data.TensorDataset(features, labels)\n",
    "    return torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)\n",
    "\n",
    "\n",
    "def show_loss(net, loss, features, labels, team):\n",
    "    \"\"\"Print `team` followed by the per-sample mean loss of `net`.\n",
    "\n",
    "    The network is switched to eval mode and left in it. `loss` is expected\n",
    "    to return per-element values already divided by the batch size (as\n",
    "    MultiClassLogLoss in this notebook does), so its sum is a batch mean.\n",
    "    \"\"\"\n",
    "    net.eval()\n",
    "    total = 0.0\n",
    "    count = 0\n",
    "    # Evaluation only: no_grad skips building the autograd graph.\n",
    "    with torch.no_grad():\n",
    "        for x, y in make_iter(features, labels, 1024):\n",
    "            # Weight each batch mean by its size; averaging the batch means\n",
    "            # directly would over-weight a short final batch.\n",
    "            total += loss(net(x), y).sum().item() * x.shape[0]\n",
    "            count += x.shape[0]\n",
    "    print(team, end=' ')\n",
    "    print('loss:', total / count)\n",
    "\n",
    "\n",
    "def train(features, labels, batch_size):\n",
    "    \"\"\"Run one optimisation pass over the data, then print the training loss.\n",
    "\n",
    "    Uses the notebook-level `net`, `loss` and `optimizer`.\n",
    "    \"\"\"\n",
    "    net.train()\n",
    "    for X, y in make_iter(features, labels, batch_size):\n",
    "        optimizer.zero_grad()\n",
    "        batch_loss = loss(net(X), y).sum()\n",
    "        batch_loss.backward()\n",
    "        optimizer.step()\n",
    "    show_loss(net, loss, features, labels, '训练集')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set the hyperparameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training schedule.\n",
    "num_epochs = 100  # passes over the training data (use ~10 for a quick trial)\n",
    "batch_size = 128  # mini-batch size used while training\n",
    "lr = 0.001        # learning rate handed to the optimizer\n",
    "\n",
    "# Run mode: False trains on all rows and writes the submission file,\n",
    "# True runs the K-fold loop instead.\n",
    "k_fold = False\n",
    "k_fold_num = 5    # number of folds when k_fold is True"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define the optimizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "optimizer = torch.optim.Adam(net.parameters(), lr=lr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Start training!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "第1轮：\n",
      "训练集 loss: 2.5550141128904613\n",
      "第2轮：\n",
      "训练集 loss: 2.5370598711889665\n",
      "第3轮：\n",
      "训练集 loss: 2.5096064944923064\n",
      "第4轮：\n",
      "训练集 loss: 2.4952176578672893\n",
      "第5轮：\n",
      "训练集 loss: 2.4762877092494833\n",
      "第6轮：\n",
      "训练集 loss: 2.4777173351296735\n",
      "第7轮：\n",
      "训练集 loss: 2.4727746223236298\n",
      "第8轮：\n",
      "训练集 loss: 2.478893978056652\n",
      "第9轮：\n",
      "训练集 loss: 2.4670776915161206\n",
      "第10轮：\n",
      "训练集 loss: 2.4526683709004544\n",
      "第11轮：\n",
      "训练集 loss: 2.466085512321312\n",
      "第12轮：\n",
      "训练集 loss: 2.4627729106894183\n",
      "第13轮：\n",
      "训练集 loss: 2.459905711643068\n",
      "第14轮：\n",
      "训练集 loss: 2.4510245389871663\n",
      "第15轮：\n",
      "训练集 loss: 2.459642517538893\n",
      "第16轮：\n",
      "训练集 loss: 2.4602036695658187\n",
      "第17轮：\n",
      "训练集 loss: 2.4307808723205175\n",
      "第18轮：\n",
      "训练集 loss: 2.454620939312559\n",
      "第19轮：\n",
      "训练集 loss: 2.442758888353557\n",
      "第20轮：\n",
      "训练集 loss: 2.4601917905963107\n",
      "第21轮：\n",
      "训练集 loss: 2.4548230073946615\n",
      "第22轮：\n",
      "训练集 loss: 2.4353819624249473\n",
      "第23轮：\n",
      "训练集 loss: 2.4596307005359854\n",
      "第24轮：\n",
      "训练集 loss: 2.4534673557414877\n",
      "第25轮：\n",
      "训练集 loss: 2.421764129525298\n",
      "第26轮：\n",
      "训练集 loss: 2.4201028063580705\n",
      "第27轮：\n",
      "训练集 loss: 2.423332574762109\n",
      "第28轮：\n",
      "训练集 loss: 2.4518459985306214\n",
      "第29轮：\n",
      "训练集 loss: 2.4236883594717455\n",
      "第30轮：\n",
      "训练集 loss: 2.429065228064299\n",
      "第31轮：\n",
      "训练集 loss: 2.4554725204592263\n",
      "第32轮：\n",
      "训练集 loss: 2.4212129235545516\n",
      "第33轮：\n",
      "训练集 loss: 2.4206792377369664\n",
      "第34轮：\n",
      "训练集 loss: 2.417930112018452\n",
      "第35轮：\n",
      "训练集 loss: 2.4221709516498593\n",
      "第36轮：\n",
      "训练集 loss: 2.412245632607342\n",
      "第37轮：\n",
      "训练集 loss: 2.4219715070057584\n",
      "第38轮：\n",
      "训练集 loss: 2.4414651591183145\n",
      "第39轮：\n",
      "训练集 loss: 2.4115030334823895\n",
      "第40轮：\n",
      "训练集 loss: 2.4236625949819603\n",
      "第41轮：\n",
      "训练集 loss: 2.4233262541688685\n",
      "第42轮：\n",
      "训练集 loss: 2.4406758846936527\n",
      "第43轮：\n",
      "训练集 loss: 2.4537293002322005\n",
      "第44轮：\n",
      "训练集 loss: 2.4218993109145086\n",
      "第45轮：\n",
      "训练集 loss: 2.436689530497109\n",
      "第46轮：\n",
      "训练集 loss: 2.4161767348265037\n",
      "第47轮：\n",
      "训练集 loss: 2.4103350797733225\n",
      "第48轮：\n",
      "训练集 loss: 2.4088531018414976\n",
      "第49轮：\n",
      "训练集 loss: 2.4153602181614695\n",
      "第50轮：\n",
      "训练集 loss: 2.437480047866181\n",
      "第51轮：\n",
      "训练集 loss: 2.4147872708060523\n",
      "第52轮：\n",
      "训练集 loss: 2.413058780012153\n",
      "第53轮：\n",
      "训练集 loss: 2.3983299176453987\n",
      "第54轮：\n",
      "训练集 loss: 2.4017825246135116\n",
      "第55轮：\n",
      "训练集 loss: 2.4038575779307974\n",
      "第56轮：\n",
      "训练集 loss: 2.408930411427727\n",
      "第57轮：\n",
      "训练集 loss: 2.4119064502227\n",
      "第58轮：\n",
      "训练集 loss: 2.4092768569648406\n",
      "第59轮：\n",
      "训练集 loss: 2.399180961933447\n",
      "第60轮：\n",
      "训练集 loss: 2.3931028867934967\n",
      "第61轮：\n",
      "训练集 loss: 2.436948234106833\n",
      "第62轮：\n",
      "训练集 loss: 2.433103249345348\n",
      "第63轮：\n",
      "训练集 loss: 2.3907290283060854\n",
      "第64轮：\n",
      "训练集 loss: 2.39958612818818\n",
      "第65轮：\n",
      "训练集 loss: 2.433995560610489\n",
      "第66轮：\n",
      "训练集 loss: 2.4062346436760644\n",
      "第67轮：\n",
      "训练集 loss: 2.3958036696716345\n",
      "第68轮：\n",
      "训练集 loss: 2.3941285779704025\n",
      "第69轮：\n",
      "训练集 loss: 2.4245097136997678\n",
      "第70轮：\n",
      "训练集 loss: 2.4304990507228115\n",
      "第71轮：\n",
      "训练集 loss: 2.4172315864296228\n",
      "第72轮：\n",
      "训练集 loss: 2.3948843982114103\n",
      "第73轮：\n",
      "训练集 loss: 2.3869522462635886\n",
      "第74轮：\n",
      "训练集 loss: 2.3903139571209886\n",
      "第75轮：\n",
      "训练集 loss: 2.379154855276877\n",
      "第76轮：\n",
      "训练集 loss: 2.391589704649154\n",
      "第77轮：\n",
      "训练集 loss: 2.409383744348735\n",
      "第78轮：\n",
      "训练集 loss: 2.3803074987896116\n",
      "第79轮：\n",
      "训练集 loss: 2.3769047868835345\n",
      "第80轮：\n",
      "训练集 loss: 2.377736467303652\n",
      "第81轮：\n",
      "训练集 loss: 2.3864275497712177\n",
      "第82轮：\n",
      "训练集 loss: 2.382210192424712\n",
      "第83轮：\n",
      "训练集 loss: 2.418196870174719\n",
      "第84轮：\n",
      "训练集 loss: 2.4188317763499723\n",
      "第85轮：\n",
      "训练集 loss: 2.424576505914435\n",
      "第86轮：\n",
      "训练集 loss: 2.378486261779056\n",
      "第87轮：\n",
      "训练集 loss: 2.4177103356603697\n",
      "第88轮：\n",
      "训练集 loss: 2.385381296798066\n",
      "第89轮：\n",
      "训练集 loss: 2.387670964349956\n",
      "第90轮：\n",
      "训练集 loss: 2.3727046648661294\n",
      "第91轮：\n",
      "训练集 loss: 2.4134620736528944\n",
      "第92轮：\n",
      "训练集 loss: 2.386697290660618\n",
      "第93轮：\n",
      "训练集 loss: 2.3624164571851005\n",
      "第94轮：\n",
      "训练集 loss: 2.371856701957596\n",
      "第95轮：\n",
      "训练集 loss: 2.3610207539894086\n",
      "第96轮：\n",
      "训练集 loss: 2.414985478341163\n",
      "第97轮：\n",
      "训练集 loss: 2.3737402151236724\n",
      "第98轮：\n",
      "训练集 loss: 2.3712116860565327\n",
      "第99轮：\n",
      "训练集 loss: 2.3537856418213923\n",
      "第100轮：\n",
      "训练集 loss: 2.380801826646\n"
     ]
    }
   ],
   "source": [
    "if k_fold:\n",
    "    # Split once, with a fixed seed, so the folds stay the same for every\n",
    "    # epoch and every run.\n",
    "    kf = KFold(n_splits=k_fold_num, shuffle=True, random_state=0)\n",
    "    for fold_num, (train_index, test_index) in enumerate(\n",
    "            kf.split(train_features)):\n",
    "        X_train, X_test = train_features[train_index], train_features[\n",
    "            test_index]\n",
    "        y_train, y_test = train_labels[train_index], train_labels[\n",
    "            test_index]\n",
    "        # Start every fold from a fresh network and optimizer. Reusing one\n",
    "        # network across folds lets it train on rows that are later used\n",
    "        # as held-out data, which makes the held-out loss meaningless.\n",
    "        # train() reads these notebook-level names when it is called.\n",
    "        net = build_model(num_inputs, num_outputs).cuda()\n",
    "        optimizer = torch.optim.Adam(net.parameters(), lr=lr)\n",
    "        for epoch in range(num_epochs):\n",
    "            print('第%d轮的第%d折：' % (epoch + 1, fold_num + 1))\n",
    "            train(X_train, y_train, batch_size)\n",
    "            show_loss(net, loss, X_test, y_test, '测试集')\n",
    "else:\n",
    "    for epoch in range(num_epochs):\n",
    "        print('第%d轮：' % (epoch + 1))\n",
    "        train(train_features, train_labels, batch_size)\n",
    "\n",
    "    # Predict the test set in order and write the submission file.\n",
    "    net.eval()\n",
    "    test_iter = torch.utils.data.DataLoader(torch.tensor(test_features,\n",
    "                                                         dtype=torch.float).cuda(),\n",
    "                                            1024,\n",
    "                                            shuffle=False)\n",
    "    # Inference only: no_grad avoids building an autograd graph per batch,\n",
    "    # and one concatenate replaces a Python list with one entry per row.\n",
    "    with torch.no_grad():\n",
    "        testResult = np.concatenate([net(x).cpu().numpy() for x in test_iter])\n",
    "    sampleSubmission = pd.read_csv('/kaggle/input/sf-crime/sampleSubmission.csv.zip')\n",
    "    # Column names (all but the first) are taken from the sample submission.\n",
    "    Result_pd = pd.DataFrame(testResult,\n",
    "                             index=sampleSubmission.index,\n",
    "                             columns=sampleSubmission.columns[1:])\n",
    "    Result_pd.to_csv('/kaggle/working/sampleSubmission(v0.5).csv', index_label='Id')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
